library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("GGally")
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(here)
## here() starts at /g/huber/users/fridljand/R/ihw-forest-paper
options(bitmapType ="cairo")

Estimate simulation hyper parameters based on real data

chr <- 21
chr_full_data_grubert <- loadRData(paste0("/g/scb/zaugg/zaugg/hQTL/GenotypeData/snps", chr, ".filtered.allGT.txt.rda"))

Data driven parameter for \(s_i\)

s_prob <- mean(chr_full_data_grubert[[1]])/2
s_prob

Same number of patients

nsample <- ncol(chr_full_data_grubert)
nsample

Read fitted modell

chr_pvalues_grubert <- data.table::fread(here(paste0("data/hqtl_chrom1_chrom2/results/cisQTLs_H3K27AC_chr",chr,".txt")))
head(chr_pvalues_grubert)
betha <- mean(chr_pvalues_grubert$beta)

sanity check

chr_pvalues_grubert$`p-value`[1]
2*(1-pt(abs(chr_pvalues_grubert$`t-stat`[1]), df = 74))

Simulation

We are following the notation in (Shabalin 2012). For a fixed SNP and a fixed histone modification peak, let \(s_i\in \{0,1,2\}\) be the frequency of the minor allele and \(g_i \in \mathbb{R}_{\geq 0}\) be the level of histone modification measured in individual \(i \in \{1,\ldots, n\}\). Let \(H \sim \mathop{Bernouilli}(1-\pi_0)\) indicate, if there actually is an association between \(s_i\) and \(g_i\). We model the relationship as follows:

\[ \begin{aligned} &g_i =\alpha+(H\beta) s_{i}+\epsilon_{i}.\end{aligned} \]

with \(s \sim \operatorname{Bin}(2,p)\) and \(\epsilon \sim \mathcal{N}\left(0, \sigma^2\right)\). We will be testing

\[ H_0: (H\beta) =0 \quad \text { against } \quad H_1: (H\beta) \neq 0. \]

Hyperparameters for simulation

ntrials = 1000
nsample = 76
pi0 = 0.9
#alpha should not play any role
simulate_data <- function(H, nsample = 76, s_prob = 0.26, alpha = 50, betha = 30, sigma = 2){
  data_sim <- data.frame(
    s_prob = s_prob,
    alpha = alpha,
    betha = betha, #true
    sigma = sigma,
    nsample_i = seq_len(nsample),
    #H   = rbinom(nsample,1,1-pi0), * H
  
    #TODO simulate alternatives
    s = rbinom(n = nsample, size = 2, s_prob),
    noise = rnorm(n = nsample, sd = sigma)
  )

  data_sim <- data_sim %>% 
    mutate(histone_modification = alpha + betha * s*H  + noise)
  return(data_sim)
}
H <- rbinom(ntrials, size = 1, 1-pi0)
data_sim <- lapply(H, simulate_data)

head(data_sim[[1]])
##   s_prob alpha betha sigma nsample_i s        noise histone_modification
## 1   0.26    50    30     2         1 0 -1.618697671             48.38130
## 2   0.26    50    30     2         2 0  1.359144989             51.35914
## 3   0.26    50    30     2         3 0  0.493898067             50.49390
## 4   0.26    50    30     2         4 2  0.732178152             50.73218
## 5   0.26    50    30     2         5 0 -2.733669559             47.26633
## 6   0.26    50    30     2         6 0 -0.009020177             49.99098

Minor allele frequency

Let \(\bar{s} := \frac{1}{n}\sum_{i=1}^n s_{i}\). Then the minor allele frequency can be defined as

\[ f:=\frac{1}{2 n} \min \left(\sum_{i=1}^n s_i, 2 n-\sum_{i=1}^n s_i\right)=\frac{1}{2} \min (\bar{s}, 2-\bar{s}) \in[0,0.5] \]

That is, of the $2n$ alleles present in $n$ individuals, what the proportion of the minor allele at the pre-specified SNP is.
minor_allele_frequencies_sim <- sapply(data_sim, function(data_sim_i){
  0.5* min(mean(data_sim_i$s), 2- mean(data_sim_i$s))
})

t-test

We test

\[ H_0: (H\beta) =0 \quad \text { against } \quad H_1: (H\beta) \neq 0. \]

Our test statistic is

\[ t_{\text {score }}=\frac{\widehat{\beta}}{S E_{\widehat{\beta}}}, \]

where \(\widehat{\beta}\) is the least square estimator for \(H\beta\) and \(S E_{\widehat{\beta}}\) is the associated standard error. We are following the notation from B7.13 in (Czado and Schmidt 2011). Under the null and assuming \(\epsilon_{1},\ldots , \epsilon_{n} \sim \text { i.i.d. } \mathcal{N}\left(0, \sigma^2\right)\) we have \(t_{\text {score }} \sim \mathcal{T}_{n-2}\).

sim_res <- lapply(data_sim, function(data_sim_i){
  t_test_lm = lm(histone_modification ~ 1 + s, data=data_sim_i)
  s_line_t_test_lm = summary(t_test_lm)$coefficients[2,]
  
  s_line_t_test_lm = data.frame(as.list(s_line_t_test_lm))
  
  #calculate minor_allele_frequency
  s_line_t_test_lm$minor_allele_frequency = 0.5* min(mean(data_sim_i$s), 2- mean(data_sim_i$s))
  
  s_line_t_test_lm$s_variance <- var(data_sim_i$s)
  
  s_line_t_test_lm
})

sim_res <- bind_rows(sim_res)
#sim_res <- sim_res %>% rename(pvalue = `Pr...t..`)
colnames(sim_res)[4] <- "pvalue"
sim_res$H <- as.factor(H)
head(sim_res)
##      Estimate Std..Error    t.value      pvalue minor_allele_frequency
## 1  0.10803028  0.3369821  0.3205817 0.749430959              0.2763158
## 2  0.05270469  0.3878867  0.1358765 0.892287851              0.2434211
## 3  1.07007858  0.4042360  2.6471629 0.009912786              0.2039474
## 4  0.39721912  0.4796317  0.8281754 0.410234260              0.2105263
## 5 -0.09696710  0.3994907 -0.2427268 0.808888564              0.1842105
## 6  0.89035418  0.4347057  2.0481768 0.044090778              0.2302632
##   s_variance H
## 1  0.4905263 0
## 2  0.3598246 0
## 3  0.3247368 0
## 4  0.3003509 0
## 5  0.2624561 0
## 6  0.3317544 0

Rationale for MAF

We don’t think that the minor allele frequency \(f\) informs about the hypothesis \(H\). We think that under the alternative \(H=1\) it is informative for our multiple testing procedure in the following way:

Higher MAF \(f =\frac{1}{2}\min \left( \bar{s},2- \bar{s}\right)\)

\(\Rightarrow\) Higher predictor variation \(\sqrt{\sum_{i=1}^n\left(s_i-\bar{s}\right)^2}\) (intuitive, both are measures of variation of \(s\))

\(\Rightarrow\) Smaller variation of \(\widehat{\beta}\) estimate \[ S E_{\widehat{\beta}}=\frac{\sqrt{\frac{1}{n-2} \sum_{i=1}^n\left(g_i-\bar{g}\right)^2}}{\sqrt{\sum_{i=1}^n\left(s_i-\bar{s}\right)^2}} \]

\(\Rightarrow\) Higher test statistic \[ t_{\text {score }}=\frac{\widehat{\beta} }{S E_{\widehat{\beta}}} \]

\(\Rightarrow\) Smaller p-value

\[P = 2(1-F_{H_0}(t_{\text {score }}))\]

Question: 1) Is above interpretation correct? 2) What happens under the null above?

Diagnostic plots

ggpairs(sim_res, 
        mapping=ggplot2::aes(colour = H, alpha = 0.3),
        columns= c(5,6,2,3,4), 
        title="correlogram with ggpairs()",
        lower=list(combo=wrap("facethist", binwidth=1))
      ) 

Pvalue histograms

groups_by_filter <- function(covariate, nbins, ties.method="random", seed=NULL){
  if (!is.null(seed) && ties.method=="random"){
    #http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
    tmp <- runif(1)
    old <- .Random.seed
    on.exit( { .Random.seed <<- old } )
    set.seed(as.integer(seed)) 
  }
    rfs <- rank(covariate, ties.method=ties.method)/length(covariate)
    as.factor(ceiling( rfs* nbins))
}
ggplot(sim_res, 
       aes(x = pvalue,
           fill = H)) +
  geom_histogram(boundary = 0) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(sim_res, 
       aes(x = pvalue,
           fill = H)) +
  geom_histogram(boundary = 0) +
  facet_wrap(groups_by_filter(sim_res$minor_allele_frequency, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Czado, Claudia, and Thorsten Schmidt. 2011. Mathematische Statistik. Berlin, Heidelberg: Springer Berlin Heidelberg. https://doi.org/10.1007/978-3-642-17261-8.
Shabalin, A. A. 2012. “Matrix eQTL: Ultra Fast eQTL Analysis via Large Matrix Operations.” Bioinformatics 28 (10): 1353–58. https://doi.org/10.1093/bioinformatics/bts163.